knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
# Load libraries
# library(ggchromatic)
library(scater)
library(ggpubr)
library(igraph)
library(dittoSeq)
library(Rphenograph)
library(aricode)
library(viridis)
library(ggrepel)
library(LaplacesDemon)
# Set general input paths to all analysis files
analysis.path <- "/mnt/bb_dqbm_volume/analysis"
# Set path to masks
tonsil.path <- "/mnt/bb_dqbm_volume/data/Tonsil_th152"
lung.path <- "/mnt/bb_dqbm_volume/data/Immucan_lung"
# Import helper fncs
source("../analysis/helpers/helper_fnc.R")
first, we look at U computed using different training datasets. For this, we read in the results from CISI training on the Tonsil th152 and the Immucan lung dataset (tonsil_vs_lung.py).
## Read results
# Read in all results for tonsil and lung comparison into one dataframe
files <- list.files(analysis.path, "simulation_results.txt",
full.names=TRUE, recursive=TRUE)
# TODO: change this line if more files come along and need to exclude
files <- files[grepl(
"tonsil_vs_lung|Tonsil_th152/training/full/k_|Immucan_lung/training/subset", files)]
res <- lapply(files, read_results, type="res") %>%
bind_rows()
# Get information about best U files
# u_files <- res %>%
# dplyr::filter(dataset==training) %>%
# dplyr::select(k, training, `Best crossvalidation fold`, datasize) %>%
# distinct()
# Harmonize all dataset names
patterns <- unique(unlist(lapply(res$training, function(name){
if(length(str_split(name, "_")[[1]])==1){
name
}
})))
res$training <- unlist(lapply(res$training, function(pat){
patterns[str_detect(pat, fixed(patterns, ignore_case=TRUE))]
}))
res$dataset <- unlist(lapply(res$dataset, function(pat){
patterns[str_detect(pat, fixed(patterns, ignore_case=TRUE))]
}))
## Read U
# u <- lapply(1:(nrow(u_files)), function(i){
# file <- file.path(analysis.path, u_files[i, "training"], "training",
# u_files[i, "datasize"],
# paste0("k_", u_files[i, "k"]),
# paste0("crossvalidation_", u_files[i, "Best crossvalidation fold"]),
# "gene_modules.csv")
# u_temp <- read_U(file, "training")
# })
u.files <- list.files(analysis.path, "gene_modules.csv",
full.names=TRUE, recursive=TRUE)
# TODO: change this line if more files come along and need to exclude
u.files <- u.files[grepl(
"tonsil_vs_lung|Tonsil_th152/training/full/k_|Immucan_lung/training/subset", u.files)]
u <- lapply(u.files, read_U, type="training") %>%
bind_rows() %>%
dplyr::rename(dataset=rep)
## Read X_test and X_simulated
# Specify X_test and X_simulated files to be read in
X.files <- list.files(analysis.path, "X_", full.names=TRUE, recursive=TRUE)
# TODO: change this line if more files come along and need to exclude
X.files <- X.files[grepl(
"tonsil_vs_lung|Tonsil_th152/training/full/k_|Immucan_lung/training/subset", X.files)]
X.files <- X.files[!grepl("_processed", X.files)]
# Read processed SCE to avoid computing UMAP, TSNE, ... all the time since it is computationally
# expensive
# X.files <- X.files[grepl("_processed", X.files)]
# Read in all sce experiments from saved anndata and save additional info in metadata
# (e.g. which dataset is used in training and testing, which k was used and if it
# is the ground truth or simulated X)
# Remove neg. values and transform counts?
X.list <- lapply(X.files, read_results, type="x")
X.list <- lapply(X.list, function(sce.temp){
pat <- metadata(sce.temp)
metadata(sce.temp)$dataset <- patterns[str_detect(pat$dataset,
fixed(patterns, ignore_case=TRUE))]
metadata(sce.temp)$training <- patterns[str_detect(pat$training,
fixed(patterns, ignore_case=TRUE))]
assay.name <- names(assays(sce.temp))
for (a in assay.name[-1]) {
assay(sce.temp, a) <- NULL
}
assay(sce.temp, "exprs") <- asinh(counts(sce.temp)/1)
sce.temp
})
# Add reduced dimensions
set.seed(220225)
X.list <- lapply(X.list, function(sce){
sce <- runUMAP(sce, exprs_values="exprs")
sce <- runTSNE(sce, exprs_values="exprs")
sce
})
# Save SCE to avoid computing UMAP, TSNE, ... all the time since it is computationally
# expensive
# lapply(1:(length(X.list)), function(i){saveRDS(X.list[i],
# paste0(gsub("\\..*", "", X.files[i]),
# "_processed.rds"))})
## Read masks
# Read in masks for tonsil data
masks.tonsil <- loadImages(file.path(tonsil.path, "steinbock/masks_deepcell"), as.is = TRUE)
mcols(masks.tonsil) <- DataFrame(sample_id=names(masks.tonsil))
# Read in masks for lung data
masks.lung <- loadImages(file.path(lung.path, "masks"), as.is = TRUE)
mcols(masks.lung) <- DataFrame(sample_id=names(masks.lung))
# For each different measurement of training results, plot a barplot comparing
# diff. k-sparsities, simulation types and training-test set combinations
for (measure in names(res)[2:10]) {
cat("##### ", measure, "\n")
# Reorder dataframe for plotting
data_temp <- res %>%
dplyr::select(dataset, training, simulation, k, !!measure) %>%
mutate(group=paste(training, dataset, sep="_"))
# Create barplot
results.barplot <- plot_cisi_results(data_temp, "group", measure, "k")
print(results.barplot)
cat("\n\n")
}
cor <- plot_U(u, "k", "dataset")
plot_U_membership(u, "k", "dataset")
# Calculate mantel test between U's of tonsil and lung for all k's
mantel <- lapply(cor, function(l){
mantel_test(l[[1]], l[[2]])$mantel_r
}) %>%
as.data.frame(col.names=names(cor), check.names=FALSE)
knitr::kable(mantel, digits=2,
col.names=paste0("k = ", names(mantel)))
| k = 1 | k = 3 |
|---|---|
| 0.66 | 0.51 |
# Subset to training and test of tonsil data
X.plotList <- keep(X.list, function(x){
metadata(x)$training=="tonsil" & metadata(x)$dataset=="tonsil" & metadata(x)$k=="1"})
names(X.plotList) <- lapply(X.plotList,
function(x){metadata(x)$ground_truth})
# Call plot_cells to get individual plots for each roi for decomposed and true
# results
X.imgList <- plot_cells(X.plotList, masks.tonsil,
"CD20")
# Plot decomposed vs true results for test roi (002)
X.img <- plot_grid(plotlist=append(X.imgList[grepl("20220520_TsH_th152_cisi1_002",
names(X.imgList))],
X.imgList[grepl("legend",
names(X.imgList))]),
ncol=2, labels=names(X.plotList),
label_size=15, hjust=c(-2, -1.5),
vjust=1, scale=0.9)
X.img
# Rename list of tonsil data for nicer titles in plotting
X.plotListRenamed <- lapply(names(X.plotList), function(n){
rownames(X.plotList[[n]]) <- paste(rownames(X.plotList[[n]]),
n, sep="\n")
reducedDims(X.plotList[[n]]) <- NULL
X.plotList[[n]]
})
names(X.plotListRenamed) <- names(X.plotList)
# Add decomposed and true dataset of tonsil into one SCE
X.overlayed <- do.call("rbind", X.plotListRenamed)
# Define protein of interest
pois <- c("CD20\nsimulated", "CD20\nground_truth")
# Plot cells coloured according to decomposed and true poi values
# (if similar in both should have a mixture of colours, e.g. orange)
X.imgDiff <- plotCells(mask=masks.tonsil[unique(colData(X.overlayed)$sample_id)],
object=X.overlayed,
cell_id="ObjectNumber", img_id="sample_id", colour_by=pois,
return_plot=TRUE, image_title=list(cex=1),
scale_bar=list(cex=1, lwidth=5),
legend=list(colour_by.title.cex=0.7, colour_by.labels.cex=0.7))
ggdraw(X.imgDiff$plot)
Plot of arcsinh transformed counts coloured by defined celltype.
# Subset to dataset trained and tested on the Immucan lung dataset
X.exprsList <- keep(X.list, function(x){
metadata(x)$training=="lung" & metadata(x)$dataset=="lung"})
names(X.exprsList) <- lapply(X.exprsList,
function(x){metadata(x)$ground_truth})
k_s <- unique(unlist(lapply(X.exprsList, function(sce_temp){metadata(sce_temp)$k})))
for (k in k_s) {
cat("##### k =", k, "\n")
# Subset for k of interest
X.exprsListK <- keep(X.exprsList, function(x){metadata(x)$k==k})
# Plot asinh transformed counts of proteins of interest of decomposed and true
# datasets coloured by different celltypes
X.Exprs <- plot_grid(plot_exprs(X.exprsListK, "B", "CD3", "CD20"),
plot_exprs(X.exprsListK, "BnT", "CD3", "CD20"),
plot_exprs(X.exprsListK, "Neutro", "MPO", "CD15"),
ncol=1)
print(X.Exprs)
cat("\n\n")
}
## Per protein results
Scatterplot of ground truth vs decomposed results per protein for k=1 and asinh transformed counts.
# Add all X_test counts together into dataframe for easier plotting
aoi <- "exprs"
X.df <- lapply(X.list, function(sce){
counts.long <- as.data.frame(assays(sce)[[aoi]]) %>%
mutate(protein=rownames(.)) %>%
melt() %>%
dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=value,
cell=variable) %>%
mutate(k=metadata(sce)$k,
dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_"))
}) %>% bind_rows() %>%
group_by(protein, cell, k, dataset) %>%
summarise_all(na.omit)
# For each test/training dataset combination plot for each true vs decomposed
# results in scatter plot and add diagonal and regression line, as well as
# R (pearson correlation coefficient)
for (i in unique(X.df$dataset)) {
cat("#####", i, "\n")
proteinPlot <- ggscatter(X.df %>%
dplyr::filter(k=="1" & dataset==i),
x="ground_truth", y="simulated",
add="reg.line",
color=pal_npg("nrc")("1"),
add.params=list(color=pal_npg("nrc")("4")[4],
size=2)) +
facet_wrap(~ protein, scales="free", ncol=5) +
theme_cowplot(10) +
geom_abline(slope=1, linetype="dashed") +
stat_cor(size=2)
print(proteinPlot)
cat("\n\n")
}
Correlation between ground truth and decomposed results per protein for k=1 and asinh transformed counts.
# Calculate correlations between ground truth and simulated data for each protein
X.cor <- lapply(X.list, function(sce){
counts.long <- as.data.frame(assays(sce)[["counts"]]) %>%
mutate(protein=rownames(.)) %>%
melt() %>%
dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=value,
cell=variable) %>%
mutate(k=metadata(sce)$k,
dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_"))
}) %>% bind_rows() %>%
group_by(protein, cell, k, dataset) %>%
summarise_all(na.omit) %>%
group_by(k, dataset, protein) %>%
summarize(correlation=cor(ground_truth, simulated),
median=median(ground_truth),
coef_of_var=sd(ground_truth)/mean(ground_truth),
relative_var=var(ground_truth)/mean(ground_truth))
# Plot correlations for k=1 for each test/training dataset combination
for (i in unique(X.df$dataset)) {
cat("#####", i, "\n")
proteinCorr <- ggplot(X.cor %>%
dplyr::filter(k=="1" & dataset==i),
aes(x=protein, y=correlation, fill=protein)) +
geom_bar(stat="identity") +
theme_cowplot() +
scale_fill_igv() +
guides(fill=FALSE) +
coord_flip()
print(proteinCorr)
cat("\n\n")
}
Correlation between ground truth and decomposed results per protein for k=1 and asinh transformed counts compared to protein characteristics of ground truth: Median, max and variance.
# Define characteristics of interest
coi <- c("median", "coef_of_var", "relative_var")
# Plot correlations for k=1 for each test/training dataset combination
for (i in coi) {
cat("#####", i, "\n")
# proteinChar <- ggscatter(X.cor %>%
# dplyr::filter(k=="1"),
# x=i, y="correlation",
# add="reg.line",
# color=pal_npg("nrc")("1"),
# add.params=list(color=pal_npg("nrc")("4")[4],
# size=2)) +
proteinChar <- ggplot(X.cor %>% dplyr::filter(k=="1"),
aes(x=!!sym(i), y=correlation, label=protein)) +
geom_point(color=pal_npg("nrc")("1")) +
facet_wrap(~ dataset, scales="free", ncol=2) +
theme_cowplot(10) +
stat_cor(size=2.8,
label.x.npc = "center",
label.y.npc = "bottom") +
stat_smooth(method="lm",
formula=y ~ log(x),
se=FALSE,
color=pal_npg("nrc")("4")[4], size=2) +
geom_text_repel(size=2.8)
print(proteinChar)
cat("\n\n")
}
Below the UMAP and TSNE of the arcsinh counts for the Immucan lung dataset are shown using k=1. Red indicates a high expression of the particular protein.
# Find highly correlated proteins that become more correlated afterwards
# X.corProtein <- lapply(X.list, function(sce){
# counts.long <- as.data.frame(cor(t(assays(sce)[[aoi]]))) %>%
# mutate(proteinA=rownames(.)) %>%
# melt() %>%
# dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=value,
# proteinB=variable) %>%
# mutate(k=metadata(sce)$k,
# dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_")) %>%
# filter(!(proteinA==proteinB)) %>%
# mutate(combProtein=str_c(pmin(as.character(proteinA),
# as.character(proteinB)),
# pmax(as.character(proteinA),
# as.character(proteinB))),
# proteinB=as.character(proteinB)) %>%
# filter(!duplicated(combProtein)) %>%
# dplyr::select(-combProtein) %>%
# filter(grepl("lung", dataset) & k==1)
#
# }) %>% bind_rows() %>%
# group_by(proteinA, proteinB, k, dataset) %>%
# summarise_all(na.omit) %>%
# mutate(diff=simulated-ground_truth) %>%
# group_by(k, dataset) %>%
# dplyr::filter(grepl("lung", dataset) & !grepl("tonsil", dataset)) %>%
# dplyr::filter(ground_truth>0.6 & diff>0) %>%
# arrange(desc(diff))
aoi <- "exprs"
X.exprsList1 <- keep(X.exprsList, function(x){metadata(x)$k=="1"})
for(protein in rownames(X.exprsList1[[1]])){
cat("#####", protein, "\n")
# Create empty list for plots
X.redDimlist <- list()
# Extract for simulated and true results for UMAP and TSNE and plot and
# colour highly correlated ground truth cells
for(sce in X.exprsList1){
for(method in c("UMAP", "TSNE")){
# sce.temp <- reducedDims(sce)[[method]] %>%
# as.data.frame() %>%
# mutate(!!X.corProtein$proteinA[1] :=assays(sce)[[aoi]][X.corProtein$proteinA[1],],
# !!X.corProtein$proteinB[1] :=assays(sce)[[aoi]][X.corProtein$proteinB[1],])
#
# X.redDimlist <- append(X.redDimlist,
# list(ggplot(sce.temp, aes(V1, V2, colour=cmy_spec(!!sym(X.corProtein$proteinA[1]),
# !!sym(X.corProtein$proteinB[1])))) +
# geom_point(size=0.3) +
# theme_cowplot(7)))
X.redDimlist <- append(X.redDimlist,
list(dittoDimPlot(sce,
var=protein,
reduction.use=method,
size=0.2,
min.color="grey",
max.color=pal_npg(("nrc"))("1")) +
#scale_color_igv() +
theme_cowplot() +
theme(legend.position="bottom") +
guides(color=guide_legend(nrow=1))))
}
}
# Remove empty plot at the beginning and add legend to the end of the plot list
X.redDimlist <- c(lapply(X.redDimlist, function(p){
p + theme(legend.position="none")
}), list(get_legend(X.redDimlist[2])))
# Plot simulated and true reduced dim on top of each other
X.redDimPlot <- plot_grid(plotlist=X.redDimlist,
ncol=2, labels=rep(names(X.exprsList1), each=2),
label_size=15, hjust=c(-2, -1.5), vjust=1, scale=0.9)
print(X.redDimPlot)
cat("\n\n")
}
To investigate the clustering results of ground truth and simulated data a bit more, we show the overlap of clusters computed using Rphenograph (run with k=47 clusters) individually and compare which clusters overlap using a heatmap.
# Subset to data tested on lung
lung.list <- keep(X.list, function(x){
metadata(x)$dataset=="lung"})
names(lung.list) <- lapply(lung.list, function(sce){
paste(metadata(sce)$training, metadata(sce)$dataset,
metadata(sce)$k, metadata(sce)$ground_truth, sep="_")
})
# Compute clusters for each dataset using Rphenograph
all.clusters <- lapply(lung.list,
function(sce){
mat <- t(assay(sce, "exprs"))
out1 <- Rphenograph(mat, k=100)
clusters <- as.vector(membership(out1[[2]])) %>%
as.data.frame() %>%
dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=".") %>%
mutate(dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_"),
k=metadata(sce)$k,
celltype=colData(sce)$celltype,
cell=colData(sce)$cell_id)
# cluster_celltypes <- clusters %>%
# mutate(celltype=colData(sce)$celltype) %>%
# group_by(!!as.symbol(metadata(sce)$ground_truth), dataset, k, celltype) %>%
# mutate(celltype_n=n()) %>%
# ungroup %>%
# group_by(!!as.symbol(metadata(sce)$ground_truth), dataset, k) %>%
# filter(celltype_n==max(celltype_n)) %>%
# ungroup %>%
# dplyr::select(-celltype_n) %>%
# distinct
#
# clusters <- clusters %>%
# left_join(cluster_celltypes)
if(metadata(sce)$ground_truth=="simulated"){
out2 <- Rphenograph(mat, k=100)
max_ari <- ARI(factor(membership(out1[[2]])), factor(membership(out2[[2]])))
clusters <- clusters %>%
mutate(max_ari=max_ari)
}
clusters
}) %>% bind_rows() %>%
group_by(k, dataset, cell, celltype) %>%
summarise_all(na.omit) %>%
ungroup
# Define celltype colours used by the rest of the script
celltypes_col <- pal_igv("default")("20")[1:length(unique(all.clusters$celltype))]
names(celltypes_col) <- unique(all.clusters$celltype)
celltypes_col <- celltypes_col[all.clusters %>% dplyr::pull(celltype) %>% levels]
# Define functions to get overview of celltypes per cluster as row and column annotations
get_anno_clusters <- function(clusters, i, d, direction="reverse", which_sim="row"){
col_anno <- HeatmapAnnotation(celltypes=anno_barplot(table(paste("gt", clusters %>%
dplyr::filter(k==i & dataset==d) %>%
pull(ground_truth)),
clusters %>%
dplyr::filter(k==i & dataset==d) %>%
pull(celltype)),
gp=gpar(fill=celltypes_col),
bar_width = 1,
height = unit(5, "cm")),
show_annotation_name=FALSE)
row_anno <- HeatmapAnnotation(celltypes=anno_barplot(table(paste("sim", clusters %>%
dplyr::filter(k==i & dataset==d) %>%
pull(simulated)),
clusters %>%
dplyr::filter(k==i & dataset==d) %>%
pull(celltype)),
gp=gpar(fill=celltypes_col),
bar_width=1,
width=unit(5, "cm"),
axis_param=list(direction=direction)),
show_annotation_name=FALSE,
which=which_sim)
list(col_anno, row_anno)
}
# Plot heatmaps showing overlap of clusters for each dataset and k
for (i in unique(all.clusters$k)){
cat("##### k =", i, "\n")
plot.new()
ht_list <- NULL
for (d in unique(all.clusters$dataset)){
# Get table of number of cells being in simulated cluster i and ground truth
# cluster j
temp.matrix <- table(paste("sim", all.clusters %>%
dplyr::filter(k==i & dataset==d) %>%
pull(simulated)),
paste("gt", all.clusters %>%
dplyr::filter(k==i & dataset==d) %>%
pull(ground_truth))) %>%
as.matrix()
temp.matrix <- log10(temp.matrix + 10)
# Get barplot annotations of celltypes
anno.list <- get_anno_clusters(all.clusters, i, d)
col_anno <- anno.list[[1]]
row_anno <- anno.list[[2]]
# Create heatmap
cluster.overlap <- Heatmap(temp.matrix,
show_row_dend=FALSE,
show_column_dend=FALSE,
top_annotation=col_anno,
left_annotation=row_anno,
col=magma(100),
column_title=d,
row_names_gp=gpar(fontsize = 8),
column_names_gp=gpar(fontsize=8),
column_title_gp=gpar(fontsize=10),
rect_gp=gpar(col="white", lwd=1))
# Add heatmap to list
ht_list <- append(ht_list,
list(grid.grabExpr(draw(cluster.overlap,
heatmap_legend_list=list(
Legend(labels=names(celltypes_col),
title="Celltypes",
legend_gp=gpar(fill=celltypes_col)))))))
}
print(plot_grid(plotlist=ht_list, nrow=1))
cat("\n\n")
}
We also compare the clusters of the ground truth data to the clsuters of the simulated data using the Kullback-Leibner divergence of the celltypes.
# Plot heatmaps showing overlap of clusters for each dataset and k
for (i in unique(all.clusters$k)){
cat("##### k =", i, "\n")
plot.new()
ht_list.kl <- NULL
for (d in unique(all.clusters$dataset)){
clusters.filtered <- all.clusters %>%
dplyr::filter(k==i & dataset==d) %>%
dplyr::select(-c("k", "dataset", "max_ari"))
clusters_sim.kl <- table(clusters.filtered %>%
dplyr::pull(celltype),
paste0("sim_", clusters.filtered %>%
dplyr::pull(simulated))) %>%
scale(center=FALSE, scale=colSums(.))
clusters_gt.kl <- table(clusters.filtered %>%
dplyr::pull(celltype),
paste0("gt_", clusters.filtered %>%
dplyr::pull(ground_truth))) %>%
scale(center=FALSE, scale=colSums(.))
clusters.kl <- apply(clusters_sim.kl, 2, function(x){
apply(clusters_gt.kl, 2, function(y){
KLD(x, y)$sum.KLD.py.px})
})
# Get barplot annotations of celltypes
anno.list <- get_anno_clusters(all.clusters, i, d)
col_anno <- anno.list[[1]]
row_anno <- anno.list[[2]]
# Create heatmap
clusters.klPlot <- Heatmap(t(clusters.kl),
show_row_dend=FALSE,
show_column_dend=FALSE,
top_annotation=col_anno,
left_annotation=row_anno,
col=rev(magma(100)),
column_title=d,
row_names_gp=gpar(fontsize = 8),
column_names_gp=gpar(fontsize=8),
column_title_gp=gpar(fontsize=10),
rect_gp=gpar(col="white", lwd=1))
# Add heatmap to list
ht_list.kl <- append(ht_list.kl,
list(grid.grabExpr(draw(clusters.klPlot))))
}
print(plot_grid(plotlist=ht_list.kl, nrow=1))
cat("\n\n")
}
Next, we assess clustering by looking at the median marker expression of each protein in each cluster and the compare it to the celltypes of each cluster. Ideally, each cluster has a median protein expression pattern that fits its main celltype.
for (i in unique(all.clusters$k)){
cat("##### k =", i, "\n")
plot.new()
ht_list.exprs <- NULL
for (d in unique(all.clusters$dataset)){
cluster_gt.exprs <- assay(lung.list[grepl(i, names(lung.list)) &
grepl(d, names(lung.list)) &
grepl("ground_truth", names(lung.list))][[1]], "exprs") %>%
as.data.frame %>%
mutate(protein=rownames(.)) %>%
melt() %>%
dplyr::rename(cell=variable, expr=value)
cluster_gt.exprs <- all.clusters %>%
dplyr::filter(k==i & dataset==d) %>%
merge(cluster_gt.exprs) %>%
dplyr::select(-c("k", "max_ari", "dataset", "celltype"))
cluster_gt.exprs <- cluster_gt.exprs %>%
mutate(ground_truth=paste("gt", ground_truth)) %>%
group_by(ground_truth, protein) %>%
summarise(mean_expr=mean(expr)) %>%
pivot_wider(names_from=ground_truth, values_from=mean_expr) %>%
column_to_rownames("protein") %>%
as.matrix
cluster_sim.exprs <- assay(lung.list[grepl(i, names(lung.list)) &
grepl(d, names(lung.list)) &
grepl("simulated", names(lung.list))][[1]], "exprs") %>%
as.data.frame %>%
mutate(protein=rownames(.)) %>%
melt() %>%
dplyr::rename(cell=variable, expr=value)
cluster_sim.exprs <- all.clusters %>%
dplyr::filter(k==i & dataset==d) %>%
merge(cluster_sim.exprs) %>%
dplyr::select(-c("k", "max_ari", "dataset", "celltype"))
cluster_sim.exprs <- cluster_sim.exprs %>%
mutate(simulated=paste("sim", simulated)) %>%
group_by(simulated, protein) %>%
summarise(mean_expr=mean(expr)) %>%
pivot_wider(names_from=simulated, values_from=mean_expr) %>%
column_to_rownames("protein") %>%
as.matrix
# Get barplot annotations of celltypes
anno.list <- get_anno_clusters(all.clusters, i, d, direction="normal", which_sim="column")
col_anno <- anno.list[[1]]
row_anno <- anno.list[[2]]
# Create heatmap
cluster.exprsPlotGt <- Heatmap(cluster_gt.exprs,
show_row_dend=FALSE,
show_column_dend=FALSE,
top_annotation=col_anno,
col=magma(100),
column_title="Ground truth",
row_names_gp=gpar(fontsize = 8),
column_names_gp=gpar(fontsize=8),
column_title_gp=gpar(fontsize=10),
rect_gp=gpar(col="white", lwd=1),
heatmap_legend_param=list(title="Ground truth"))
cluster.exprsPlotSim <- Heatmap(cluster_sim.exprs,
show_row_dend=FALSE,
show_column_dend=FALSE,
top_annotation=row_anno,
col=magma(100),
column_title="Simulated",
row_names_gp=gpar(fontsize = 8),
column_names_gp=gpar(fontsize=8),
column_title_gp=gpar(fontsize=10),
rect_gp=gpar(col="white", lwd=1),
heatmap_legend_param=list(title="Simulated"))
# Add heatmap to list
ht_list.exprs <- append(ht_list.exprs,
list(grid.grabExpr(draw(cluster.exprsPlotGt + cluster.exprsPlotSim,
heatmap_legend_list=list(
Legend(labels=names(celltypes_col),
title="Celltypes",
legend_gp=gpar(fill=celltypes_col))),
column_title=d))))
}
print(plot_grid(plotlist=ht_list.exprs, nrow=1))
cat("\n\n")
}
Finally, we compute the adjusted random index (ARI) between the clustering of the ground truth and simulated data to assess the overlap.
all.clusters <- all.clusters %>%
ungroup %>%
mutate(point.x=ifelse(k=="3", -0.25, 0.25) + as.numeric(as.factor(dataset)))
ari.plot <- ggplot(all.clusters %>%
group_by(dataset, k) %>%
summarize(ARI=ARI(simulated, ground_truth),
point.x=point.x,
max_ari=max_ari)) +
geom_bar(aes(x=dataset, y=ARI, fill=k), position="dodge", stat="identity") +
geom_point(aes(x=point.x, y=max_ari)) +
theme_cowplot() +
scale_fill_npg() +
coord_flip() +
ylim(0, 1)
print(ari.plot)